This notebook demonstrates some of the basic functionality of librosa version 0.3.
Following through this example, you'll learn how to:
In [1]:
# We'll need the os module for file path manipulation
import os
# And numpy for some mathematical operations
import numpy as np
# Librosa for audio
import librosa
# matplotlib for displaying the output
import matplotlib.pyplot as plt
%matplotlib inline
# and IPython.display for audio output
import IPython.display
In [2]:
# Start from the example track bundled with librosa
audio_path = librosa.util.example_audio_file()

# To analyse your own recording instead, uncomment the assignment below
# and point it at the file you want to use:
#
# audio_path = '/path/to/your/favorite/song.mp3'

# Decode the file: `y` is the waveform, `sr` its sampling rate
y, sr = librosa.load(audio_path)
By default, librosa will resample the signal to 22050Hz.
You can change this behavior by setting the sr parameter of librosa.load(), or disable resampling entirely by setting sr=None.
You might want to stop here to verify that scikits.samplerate is installed and functioning correctly. Without this, audio resampling will fall back on scipy.signal.resample(), which is rather inefficient.
You can test this by printing out the librosa.core._HAS_SAMPLERATE flag:
In [3]:
# Report whether librosa detected the scikits.samplerate backend.
# The message is built as a single formatted string so that this cell runs
# under both Python 2 and Python 3: the bare `print a, b` statement is a
# SyntaxError on Python 3. The output text is unchanged.
print('HAS_SAMPLERATE:  %s' % (librosa.core._HAS_SAMPLERATE,))
This first step will show how to compute a Mel spectrogram from an audio waveform.
In [4]:
# Compute a mel-scaled power (energy-squared) spectrogram of the full signal.
# The hop length is kept small (64 samples) so that the frames line up with
# the beat tracker example further down.
S = librosa.feature.melspectrogram(
    y,
    sr=sr,
    n_fft=2048,
    hop_length=64,
    n_mels=128,
)

# Move to a log (dB) scale, measured relative to the peak power
log_S = librosa.logamplitude(S, ref_power=np.max)

# Start a fresh, wide figure
plt.figure(figsize=(12, 4))

# Render the spectrogram with a mel frequency axis; the sample rate and
# hop length are what specshow uses to label the time axis
librosa.display.specshow(
    log_S,
    sr=sr,
    hop_length=64,
    x_axis='time',
    y_axis='mel',
)

# Title, colour bar (labelled in dB) and a compact layout
plt.title('mel power spectrogram')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
In [5]:
# Split the waveform into a harmonic part and a percussive part.
# Later cells use y_harmonic for the chroma features and y_percussive for beat tracking.
y_harmonic, y_percussive = librosa.effects.hpss(y)
In [6]:
# What do the spectrograms of the two components look like?
# Compute a mel-scaled power (energy-squared) spectrogram for each one.
# The small hop length of 64 keeps the frames aligned with the beat tracker example below.
S_harmonic = librosa.feature.melspectrogram(y_harmonic, sr=sr, n_fft=2048, hop_length=64, n_mels=128)
S_percussive = librosa.feature.melspectrogram(y_percussive, sr=sr, n_fft=2048, hop_length=64, n_mels=128)

# Convert to log scale (dB), each relative to its own peak power
log_Sh = librosa.logamplitude(S_harmonic, ref_power=np.max)
log_Sp = librosa.logamplitude(S_percussive, ref_power=np.max)

# One figure, two stacked panels
plt.figure(figsize=(12,6))

# Top panel: harmonic component.
# sr and hop_length are passed here as well, so that both panels are rendered
# with the same parameters instead of the top one relying on specshow's defaults.
# The time axis is only labelled on the bottom panel.
plt.subplot(2,1,1)
librosa.display.specshow(log_Sh, sr=sr, hop_length=64, y_axis='mel')
plt.title('mel power spectrogram (Harmonic)')
plt.colorbar(format='%+02.0f dB')

# Bottom panel: percussive component, with the shared time axis
plt.subplot(2,1,2)
librosa.display.specshow(log_Sp, sr=sr, hop_length=64, x_axis='time', y_axis='mel')
plt.title('mel power spectrogram (Percussive)')
plt.colorbar(format='%+02.0f dB')

# Make the figure layout compact
plt.tight_layout()
Next, we'll extract Chroma features to represent pitch information.
In [7]:
# Chroma is computed from the harmonic component only, which keeps
# transients from polluting the pitch-class energies.
# A longer FFT window (4096) gives better resolution at low frequencies.
C = librosa.feature.chromagram(
    y=y_harmonic,
    sr=sr,
    n_fft=4096,
    hop_length=64,
)

# Fresh figure for the chromagram
plt.figure(figsize=(12, 4))

# Show the energy in each chromatic pitch class over time.
# vmin/vmax are pinned so the colours cover the full range of chroma values.
librosa.display.specshow(
    C,
    sr=sr,
    hop_length=64,
    x_axis='time',
    y_axis='chroma',
    vmin=0,
    vmax=1,
)

plt.title('Chromagram')
plt.colorbar()
plt.tight_layout()
Mel-frequency cepstral coefficients are commonly used to represent texture or timbre of sound.
In [8]:
# Next, we'll extract the top 20 Mel-frequency cepstral coefficients (MFCCs)
# from the log-power mel spectrogram computed earlier
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=20)

# Let's add the first and second deltas while we're at it
delta_mfcc = librosa.feature.delta(mfcc)
delta2_mfcc = librosa.feature.delta(mfcc, order=2)

# How do they look? We'll show each in its own subplot.
# The LaTeX labels are raw strings: in a normal string literal '\D' is an
# invalid escape sequence, which newer Python versions warn about.
# The resulting label text is unchanged.
plt.figure(figsize=(12, 6))

plt.subplot(3,1,1)
librosa.display.specshow(mfcc)
plt.ylabel('MFCC')
plt.colorbar()

plt.subplot(3,1,2)
librosa.display.specshow(delta_mfcc)
plt.ylabel(r'MFCC-$\Delta$')
plt.colorbar()

# Only the bottom panel carries the time axis
plt.subplot(3,1,3)
librosa.display.specshow(delta2_mfcc, sr=sr, hop_length=64, x_axis='time')
plt.ylabel(r'MFCC-$\Delta^2$')
plt.colorbar()

plt.tight_layout()

# For future use, we'll stack these together into one matrix
M = np.vstack([mfcc, delta_mfcc, delta2_mfcc])
In the above examples, the Mel power spectrogram is negative-valued (dB relative to peak power), and specshow() defaults to a purple->white color gradient.
The chromagram example is positive-valued, and specshow() will default to a white->red color gradient.
If the input data has both positive and negative values, as in the MFCC example, then a purple->white->orange diverging color gradient will be used.
These defaults have been selected to ensure readability in print (grayscale) and are color-blind friendly.
Just as in pyplot.imshow(), the color map can be overridden by setting the cmap keyword argument.
In [9]:
# Run the beat tracker on the percussive component
tempo, beats = librosa.beat.beat_track(
    y=y_percussive,
    sr=sr,
    hop_length=64,
)

# Draw the mel spectrogram again so the detected beats can be laid over it
plt.figure(figsize=(12, 4))
librosa.display.specshow(
    log_S,
    sr=sr,
    hop_length=64,
    x_axis='time',
    y_axis='mel',
)

# Each beat is marked twice: a thick black line first, then a thinner white
# line on top of it, which gives the marker a drop shadow
for line_colour, line_width in (('k', 2.5), ('w', 1.5)):
    plt.vlines(beats, 0, log_S.shape[0], colors=line_colour, linestyles='-', linewidth=line_width)

plt.axis('tight')
plt.tight_layout()
By default, the beat tracker will trim away any leading or trailing beats that don't appear strong enough.
To disable this behavior, call beat_track() with trim=False.
In [10]:
# Each message is built as a single formatted string so that this cell runs
# under both Python 2 and Python 3: the bare `print a, b` statement is a
# SyntaxError on Python 3. The printed text is unchanged.
print('Estimated tempo: %.2f BPM' % tempo)
print('First 5 beat frames:  %s' % (beats[:5],))

# Frame numbers are great and all, but when do those beats occur?
print('First 5 beat times:  %s' % (librosa.frames_to_time(beats[:5], sr=sr, hop_length=64),))

# We could also get frame numbers from times by librosa.time_to_frames()
In [11]:
# feature.sync will summarize each beat event by the mean feature vector within that beat
M_sync = librosa.feature.sync(M, beats)

# M stacks three equally tall blocks (MFCC, delta, delta-delta), so one tick
# is placed at the first row of each block. The block height is derived from
# M rather than hard-coded, so the ticks stay correct if n_mfcc is changed.
# The LaTeX labels are raw strings: in a normal string literal '\D' is an
# invalid escape sequence, which newer Python versions warn about.
block_labels = ['MFCC', r'$\Delta$', r'$\Delta^2$']
rows_per_block = M.shape[0] // len(block_labels)
block_ticks = np.arange(len(block_labels)) * rows_per_block

plt.figure(figsize=(12,6))

# Let's plot the original and beat-synchronous features against each other
plt.subplot(2,1,1)
librosa.display.specshow(M)
plt.title(r'MFCC-$\Delta$-$\Delta^2$')

# We can also use pyplot *ticks directly
# Let's mark off the raw MFCC and the delta features
plt.yticks(block_ticks, block_labels)
plt.colorbar()

plt.subplot(2,1,2)
librosa.display.specshow(M_sync)

# librosa can generate axis ticks from arbitrary timestamps and beat events also
librosa.display.time_ticks(librosa.frames_to_time(beats, sr=sr, hop_length=64))

# Synchronization only aggregates over time, so the row layout matches M
plt.yticks(block_ticks, block_labels)
plt.title(r'Beat-synchronous MFCC-$\Delta$-$\Delta^2$')
plt.colorbar()

plt.tight_layout()
In [12]:
# Beat synchronization is not tied to the mean, nor to MFCCs.
# Here the chroma features are summarized per beat, and the mean is swapped
# for the median; any statistical aggregation function (min, max, ...) works.
C_sync = librosa.feature.sync(C, beats, aggregate=np.median)

# Beat positions in seconds, used to label the beat-synchronous panel
beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=64)

plt.figure(figsize=(12, 6))

# Upper panel: frame-level chroma
plt.subplot(2, 1, 1)
librosa.display.specshow(
    C,
    sr=sr,
    hop_length=64,
    y_axis='chroma',
    vmin=0.0,
    vmax=1.0,
)
plt.title('Chroma')
plt.colorbar()

# Lower panel: one column per beat, with ticks at the beat times
plt.subplot(2, 1, 2)
librosa.display.specshow(
    C_sync,
    y_axis='chroma',
    vmin=0.0,
    vmax=1.0,
)
librosa.display.time_ticks(beat_times)
plt.title('Beat-synchronous Chroma (median aggregation)')
plt.colorbar()

plt.tight_layout()